data preprocessing

In [1]:
import nltk
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelBinarizer
In [2]:
df_train_full = pd.read_pickle("./pan19_df_clean_train_feateng.pkl")
df_test_full = pd.read_pickle("./pan19_df_clean_test_feateng.pkl")
print(f"train size: {df_train_full.shape}, test size: {df_test_full.shape}")
train size: (412000, 4), test size: (264000, 4)
In [3]:
def feature(df) :
    df['word_count'] = df['tweet'].apply(lambda x : len(x.split()))
    df['char_count'] = df['tweet'].apply(lambda x : len(x.replace(" ","")))
    df['word_density'] = df['word_count'] / (df['char_count'] + 1)
    df['total_length'] = df['tweet'].apply(len)
    df['capitals'] = df['tweet'].apply(lambda tweet: sum(1 for c in tweet if c.isupper()))
    df['caps_vs_length'] = df.apply(lambda row: float(row['capitals'])/float(row['total_length']),axis=1)
    df['num_exclamation_marks'] =df['tweet'].apply(lambda x: x.count('!'))
    df['num_question_marks'] = df['tweet'].apply(lambda x: x.count('?'))
    df['num_punctuation'] = df['tweet'].apply(lambda x: sum(x.count(w) for w in '.,;:'))
    df['num_symbols'] = df['tweet'].apply(lambda x: sum(x.count(w) for w in '*&$%'))
    df['num_unique_words'] = df['tweet'].apply(lambda x: len(set(w for w in x.split())))
    df['words_vs_unique'] = df['num_unique_words'] / df['word_count']
    df["word_unique_percent"] =  df["num_unique_words"]*100/df['word_count']
    df['num_retweet'] = df['clean_tweet'].apply(lambda x: x.count('rt'))
    df['num_url'] = df['clean_tweet'].apply(lambda x: x.count('URL_TOKEN'))
    df['num_number'] = df['clean_tweet'].apply(lambda x: x.count('NUM_TOKEN'))
    return df
In [4]:
## PAZI ovdje uzimam samo subset za lakse igranje
#num_examples = 10_000
#df_train = feature(df_train_full.loc[list(range(5*num_examples)), :])
#df_test = feature(df_test_full.loc[list(range(num_examples)), :])
df_train = feature(df_train_full)
df_test = feature(df_test_full)

df = pd.concat((df_train_full.copy(), df_test_full.copy()))
print(f"total size: {df.shape}")
total size: (676000, 20)
In [5]:
df_train.info()
df_train
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 412000 entries, 0 to 411999
Data columns (total 20 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   author                 412000 non-null  object 
 1   tweet                  412000 non-null  object 
 2   bot                    412000 non-null  object 
 3   clean_tweet            412000 non-null  object 
 4   word_count             412000 non-null  int64  
 5   char_count             412000 non-null  int64  
 6   word_density           412000 non-null  float64
 7   total_length           412000 non-null  int64  
 8   capitals               412000 non-null  int64  
 9   caps_vs_length         412000 non-null  float64
 10  num_exclamation_marks  412000 non-null  int64  
 11  num_question_marks     412000 non-null  int64  
 12  num_punctuation        412000 non-null  int64  
 13  num_symbols            412000 non-null  int64  
 14  num_unique_words       412000 non-null  int64  
 15  words_vs_unique        412000 non-null  float64
 16  word_unique_percent    412000 non-null  float64
 17  num_retweet            412000 non-null  int64  
 18  num_url                412000 non-null  int64  
 19  num_number             412000 non-null  int64  
dtypes: float64(4), int64(12), object(4)
memory usage: 62.9+ MB
Out[5]:
author tweet bot clean_tweet word_count char_count word_density total_length capitals caps_vs_length num_exclamation_marks num_question_marks num_punctuation num_symbols num_unique_words words_vs_unique word_unique_percent num_retweet num_url num_number
0 47b0c91666f8e47948049eb1fc2202f9 Town &amp; Country Magazine Uninvited Monica L... bot town country magazine uninvited monica lewinsk... 40 269 0.148148 308 41 0.133117 0 0 8 3 22 0.550000 55.000000 0 2 0
1 47b0c91666f8e47948049eb1fc2202f9 Lead Business Systems Analyst – Automated Test... bot lead business system analyst automated testing... 35 268 0.130112 302 37 0.122517 0 0 12 0 33 0.942857 94.285714 1 2 2
2 47b0c91666f8e47948049eb1fc2202f9 Senior Software Engineer, Business Process Eng... bot senior software engineer business process engi... 33 263 0.125000 295 25 0.084746 0 0 9 0 32 0.969697 96.969697 0 2 0
3 47b0c91666f8e47948049eb1fc2202f9 Technical Support Engineer: Technical Support ... bot technical support engineer technical support e... 35 269 0.129630 303 35 0.115512 0 0 9 0 30 0.857143 85.714286 2 2 1
4 47b0c91666f8e47948049eb1fc2202f9 Mueller questioned Novartis payment to Trump l... bot mueller questioned novartis payment trump lawy... 37 264 0.139623 300 33 0.110000 0 0 6 0 24 0.648649 64.864865 3 2 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
411995 70ddcdd222a02d2df685e67a38b903d7 A little more persistence, a little more effor... bot little persistence little effort seemed hopele... 20 108 0.183486 127 3 0.023622 0 0 3 0 18 0.900000 90.000000 2 0 0
411996 70ddcdd222a02d2df685e67a38b903d7 Perseverance is a great element of success. If... bot perseverance great element success knock long ... 26 116 0.222222 141 3 0.021277 0 0 3 0 24 0.923077 92.307692 0 0 0
411997 70ddcdd222a02d2df685e67a38b903d7 Profit is a by-product of work; happiness is i... bot profit product work happiness chief product he... 13 63 0.203125 75 3 0.040000 0 0 2 0 12 0.923077 92.307692 0 0 0
411998 70ddcdd222a02d2df685e67a38b903d7 Quality questions create a quality life. Succe... bot quality question create quality life successfu... 21 116 0.179487 136 4 0.029412 0 0 4 0 19 0.904762 90.476190 0 0 0
411999 70ddcdd222a02d2df685e67a38b903d7 He who has a strong enough why can bear almost... bot ha strong enough bear almost friedrich nietzsche 14 63 0.218750 76 3 0.039474 0 0 1 0 14 1.000000 100.000000 0 0 0

412000 rows × 20 columns

In [6]:
import plotly.graph_objects as go

fig = go.Figure(
    data=[
        go.Scatterpolar(
            r=[df_train.loc[df_train["bot"] == "bot"]['num_url'].mean() * 100,
               df_train.loc[df_train["bot"] == "bot"]['num_number'].mean() * 100,
               df_train.loc[df_train["bot"] == "bot"]['num_retweet'].mean() * 100],
            theta=['urls', "nums", "Number of retweets"],
            fill='toself',
            line=dict(color='red'),
            name="bot stats means", subplot="polar3"),

        go.Scatterpolar(
            r=[df_train.loc[df_train["bot"] == "human"]['num_url'].mean() * 100,
               df_train.loc[df_train["bot"] == "human"]['num_number'].mean() * 100,
               df_train.loc[df_train["bot"] == "human"]['num_retweet'].mean() * 100],
            theta=['urls', "nums", "Number of retweets"],
            fill='toself',
            line=dict(color='blue'),
            name="human stats means", subplot="polar3"),

        go.Scatterpolar(
            r=[df_train.loc[df_train["bot"] == "bot"]['total_length'].mean(),
                df_train.loc[df_train["bot"] == "bot"]['char_count'].mean(),
                df_train.loc[df_train["bot"] == "bot"]['word_count'].mean()],
            theta=['Total_Lenght', 'Charcount', "word count"],
            fill='toself',
            line=dict(color='brown'),
            name="bot stats means", subplot="polar3"),

        go.Scatterpolar(
            r=[df_train.loc[df_train["bot"] == "human"]['total_length'].mean(),
                df_train.loc[df_train["bot"] == "human"]['char_count'].mean(),
                df_train.loc[df_train["bot"] == "human"]['word_count'].mean()],
            theta=['Total_Lenght', 'Charcount', "word count"],
            fill='toself',
            line=dict(color='magenta'),
            name="human stats means", subplot="polar3")
    ],
    layout=go.Layout(
        polar3=dict(
            domain=dict(
                x=[1, 1],
                y=[1, 1]
            ),
            radialaxis=dict(visible=True,)),
        polar2=dict(
            domain=dict(
                x=[0, 0.3],
                y=[0, 0.45]
            ),
            radialaxis=dict(visible=True,)),

        polar=dict(
            domain=dict(
                x=[0.33, 0.6525],
                y=[0, 0.45]
            ),
            radialaxis=dict(visible=True,)),

        polar4=dict(
            domain=dict(
                x=[0.33, 0.6525],
                y=[0.55, 1]
            ),
            radialaxis=dict(visible=True,)),

        polar5=dict(
            domain=dict(
                x=[0.6775, 1],
                y=[0, 0.45]
            ),
            radialaxis=dict(visible=True,)),
        polar6=dict(
            domain=dict(
                x=[0.6775, 1],
                y=[0.55, 1]
            ),
            radialaxis=dict(visible=True,))
    )
)

fig.show()
In [7]:
def draw_word_features(df_train):
    plot = go.Figure(
    data=[
        go.Scatterpolar(
            r=[df_train.loc[df_train["bot"] == "bot"]['word_count'].mean(),
               df_train.loc[df_train["bot"] == "bot"]['char_count'].mean(),
               df_train.loc[df_train["bot"] == "bot"]['word_density'].mean(),
               df_train.loc[df_train["bot"] == "bot"]['total_length'].mean(),
               df_train.loc[df_train["bot"] == "bot"]['capitals'].mean(),
               df_train.loc[df_train["bot"] == "bot"]['caps_vs_length'].mean()],
            theta=['mean word count', "mean char count", "mean word denisty",
                   "mean tweet length", "mean capital letter count", "mean capital vs length"],
            fill='toself',
            line=dict(color='red'),
            name="bot means", subplot="polar6"),

        go.Scatterpolar(
            r=[df_train.loc[df_train["bot"] == "human"]['word_count'].mean(),
               df_train.loc[df_train["bot"] == "human"]['char_count'].mean(),
               df_train.loc[df_train["bot"] == "human"]['word_density'].mean(),
               df_train.loc[df_train["bot"] == "human"]['total_length'].mean(),
               df_train.loc[df_train["bot"] == "human"]['capitals'].mean(),
               df_train.loc[df_train["bot"] == "human"]['caps_vs_length'].mean()],
            theta=['mean word count', "mean char count", "mean word denisty",
                   "mean tweet length", "mean capital letter count", "mean capital vs length"],
            fill='toself',
            line=dict(color='blue'),
            name="human means", subplot="polar6"),
    ],
    layout=go.Layout(
        polar=dict(
            domain=dict(
                x=[0, 1],
                y=[0, 1]
            ),
            radialaxis=dict(visible=True,)),
        polar2=dict(
            domain=dict(
                x=[0, 1],
                y=[0, 1]
            ),
            radialaxis=dict(visible=True,)),

        polar3=dict(
            domain=dict(
                x=[0, 1],
                y=[0, 1]
            ),
            radialaxis=dict(visible=True,)),
        
        polar4=dict(
            domain=dict(
                x=[0, 1],
                y=[0, 1]
            ),
            radialaxis=dict(visible=True,)),
        
        polar5=dict(
            domain=dict(
                x=[0, 1],
                y=[0, 1]
            ),
            radialaxis=dict(visible=True,)),
        
        polar6=dict(
            domain=dict(
                x=[0, 1],
                y=[0, 1]
            ),
            radialaxis=dict(visible=True,)),
        )
    )
    return plot

fig = draw_word_features(df)
fig.show()
In [8]:
def draw_url_rt_num(df_train):
    plot = go.Figure(
    data=[
        go.Scatterpolar(
            r=[df_train.loc[df_train["bot"] == "bot"]['num_url'].mean(),
               df_train.loc[df_train["bot"] == "bot"]['num_number'].mean(),
               df_train.loc[df_train["bot"] == "bot"]['num_retweet'].mean()],
            theta=['mean URL', "mean NUM", "mean RT"],
            fill='toself',
            line=dict(color='red'),
            name="bot means", subplot="polar3"),

        go.Scatterpolar(
            r=[df_train.loc[df_train["bot"] == "human"]['num_url'].mean(),
               df_train.loc[df_train["bot"] == "human"]['num_number'].mean(),
               df_train.loc[df_train["bot"] == "human"]['num_retweet'].mean()],
            theta=['mean URL', "mean NUM", "mean RT"],
            fill='toself',
            line=dict(color='blue'),
            name="human means", subplot="polar3"),
    ],
    layout=go.Layout(
        polar=dict(
            domain=dict(
                x=[0, 1],
                y=[0, 1]
            ),
            radialaxis=dict(visible=True,)),
        polar2=dict(
            domain=dict(
                x=[0, 1],
                y=[0, 1]
            ),
            radialaxis=dict(visible=True,)),

        polar3=dict(
            domain=dict(
                x=[0, 1],
                y=[0, 1]
            ),
            radialaxis=dict(visible=True,)),
        )
    )
    return plot

fig = draw_url_rt_num(df)
fig.show()
In [9]:
def draw_uniques(df_train):
    plot = go.Figure(
    data=[
        go.Scatterpolar(
            r=[df_train.loc[df_train["bot"] == "bot"]['num_unique_words'].mean(),
               df_train.loc[df_train["bot"] == "bot"]['words_vs_unique'].mean(),
               df_train.loc[df_train["bot"] == "bot"]['word_unique_percent'].mean()],
            theta=['mean unique words', "mean words vs unique", "mean words unique percent"],
            fill='toself',
            line=dict(color='red'),
            name="bot values", subplot="polar3"),

        go.Scatterpolar(
            r=[df_train.loc[df_train["bot"] == "human"]['num_unique_words'].mean(),
               df_train.loc[df_train["bot"] == "human"]['words_vs_unique'].mean(),
               df_train.loc[df_train["bot"] == "human"]['word_unique_percent'].mean()],
            theta=['mean unique words', "mean words vs unique", "mean words unique percent"],
            fill='toself',
            line=dict(color='blue'),
            name="human values", subplot="polar3"),
    ],
    layout=go.Layout(
        polar=dict(
            domain=dict(
                x=[0, 1],
                y=[0, 1]
            ),
            radialaxis=dict(visible=True,)),
        polar2=dict(
            domain=dict(
                x=[0, 1],
                y=[0, 1]
            ),
            radialaxis=dict(visible=True,)),

        polar3=dict(
            domain=dict(
                x=[0, 1],
                y=[0, 1]
            ),
            radialaxis=dict(visible=True,)),
        )
    )
    return plot

fig = draw_uniques(df)
fig.show()
In [10]:
def draw_punctuation(df_train):
    freq_plot = go.Figure(
    data=[
        go.Scatterpolar(
            r=[df_train.loc[df_train["bot"] == "bot"]['num_exclamation_marks'].mean(),
               df_train.loc[df_train["bot"] == "bot"]['num_question_marks'].mean(),
               df_train.loc[df_train["bot"] == "bot"]['num_punctuation'].mean(),
               df_train.loc[df_train["bot"] == "bot"]['num_symbols'].mean()],
            theta=['mean exclamation marks', "mean question marks", "mean punctuation", "mean symbols"],
            fill='toself',
            line=dict(color='red'),
            name="bot values", subplot="polar4"),

        go.Scatterpolar(
            r=[df_train.loc[df_train["bot"] == "human"]['num_exclamation_marks'].mean(),
               df_train.loc[df_train["bot"] == "human"]['num_question_marks'].mean(),
               df_train.loc[df_train["bot"] == "human"]['num_punctuation'].mean(),
               df_train.loc[df_train["bot"] == "human"]['num_symbols'].mean()],
            theta=['mean exclamation marks', "mean question marks", "mean punctuation", "mean symbols"],
            fill='toself',
            line=dict(color='blue'),
            name="human values", subplot="polar4"),
    ],
    layout=go.Layout(
        polar=dict(
            domain=dict(
                x=[0, 1],
                y=[0, 1]
            ),
            radialaxis=dict(visible=True,)),
        polar2=dict(
            domain=dict(
                x=[0, 1],
                y=[0, 1]
            ),
            radialaxis=dict(visible=True,)),

        polar3=dict(
            domain=dict(
                x=[0, 1],
                y=[0, 1]
            ),
            radialaxis=dict(visible=True,)),
        polar4=dict(
            domain=dict(
                x=[0, 1],
                y=[0, 1]
            ),
            radialaxis=dict(visible=True,)),
        )
    )
    return freq_plot
fig = draw_punctuation(df)
fig.show()
In [11]:
word_features = ["word_count", "char_count", "word_density", "total_length", "capitals", "caps_vs_length"]
punctuation_features = ["num_exclamation_marks", "num_question_marks", "num_punctuation", "num_symbols"]
uniques_features = ["num_unique_words", "words_vs_unique", "word_unique_percent"]
means_features = ["num_retweet", "num_url", "num_number"]

all_new_features = word_features + punctuation_features + uniques_features + means_features
In [12]:
for feat in all_new_features:
    bot_mean = df.loc[df["bot"] == "bot"][feat].mean()
    bot_std = df.loc[df["bot"] == "bot"][feat].std()
    print(f"{feat}[bot] mean: {bot_mean} | stddev: {bot_std}")
        
    human_mean = df.loc[df["bot"] == "human"][feat].mean()
    human_std = df.loc[df["bot"] == "human"][feat].std()
    print(f"{feat}[human] mean: {human_mean} | stddev: {human_std}")
    
    print(f"mean absolute difference: {abs(bot_mean - human_mean)} | stddev absolute difference: {abs(bot_std - human_std)}")
    print()
word_count[bot] mean: 17.092402366863904 | stddev: 11.161447100720757
word_count[human] mean: 15.67326923076923 | stddev: 9.157090300989807
mean absolute difference: 1.4191331360946737 | stddev absolute difference: 2.0043567997309495

char_count[bot] mean: 106.04140532544379 | stddev: 70.91087747333623
char_count[human] mean: 89.32908875739645 | stddev: 44.473702445375984
mean absolute difference: 16.712316568047342 | stddev absolute difference: 26.437175027960244

word_density[bot] mean: 0.1631583227101447 | stddev: 0.050439161635511834
word_density[human] mean: 0.16923527970625796 | stddev: 0.04649282976566136
mean absolute difference: 0.00607695699611327 | stddev absolute difference: 0.003946331869850474

total_length[bot] mean: 122.04418047337278 | stddev: 81.311742182314
total_length[human] mean: 103.97819822485206 | stddev: 52.90192401944772
mean absolute difference: 18.06598224852071 | stddev absolute difference: 28.409818162866287

capitals[bot] mean: 11.965964497041421 | stddev: 12.43364792581487
capitals[human] mean: 8.096807692307692 | stddev: 6.658129720819837
mean absolute difference: 3.869156804733729 | stddev absolute difference: 5.775518204995033

caps_vs_length[bot] mean: 0.10026423919359877 | stddev: 0.08878711950630992
caps_vs_length[human] mean: 0.08519310724465506 | stddev: 0.06809658550382333
mean absolute difference: 0.015071131948943703 | stddev absolute difference: 0.02069053400248659

num_exclamation_marks[bot] mean: 0.11435502958579882 | stddev: 0.4669559215551203
num_exclamation_marks[human] mean: 0.25847041420118344 | stddev: 0.7793960198805322
mean absolute difference: 0.14411538461538462 | stddev absolute difference: 0.3124400983254119

num_question_marks[bot] mean: 0.07214497041420119 | stddev: 0.3294151775549764
num_question_marks[human] mean: 0.11409467455621301 | stddev: 0.4285528028333596
mean absolute difference: 0.04194970414201182 | stddev absolute difference: 0.09913762527838321

num_punctuation[bot] mean: 3.778713017751479 | stddev: 3.047978828966403
num_punctuation[human] mean: 2.572301775147929 | stddev: 2.079331518688385
mean absolute difference: 1.2064112426035503 | stddev absolute difference: 0.9686473102780182

num_symbols[bot] mean: 0.3063136094674556 | stddev: 2.3876022406943758
num_symbols[human] mean: 0.08513609467455621 | stddev: 0.3892728590537352
mean absolute difference: 0.2211775147928994 | stddev absolute difference: 1.9983293816406404

num_unique_words[bot] mean: 15.621736686390532 | stddev: 9.400164807768299
num_unique_words[human] mean: 14.964594674556214 | stddev: 8.277901326970238
mean absolute difference: 0.6571420118343188 | stddev absolute difference: 1.1222634807980612

words_vs_unique[bot] mean: 0.9435032418757333 | stddev: 0.0944753266547895
words_vs_unique[human] mean: 0.9702862899184289 | stddev: 0.052587012206589484
mean absolute difference: 0.0267830480426956 | stddev absolute difference: 0.041888314448200015

word_unique_percent[bot] mean: 94.35032418757332 | stddev: 9.447532665478947
word_unique_percent[human] mean: 97.02862899184288 | stddev: 5.258701220658949
mean absolute difference: 2.6783048042695583 | stddev absolute difference: 4.188831444819998

num_retweet[bot] mean: 0.3046005917159763 | stddev: 0.6066768881030227
num_retweet[human] mean: 0.5055680473372781 | stddev: 0.6635166335858295
mean absolute difference: 0.2009674556213018 | stddev absolute difference: 0.05683974548280679

num_url[bot] mean: 0.8754289940828403 | stddev: 0.7997643466140476
num_url[human] mean: 0.46603254437869823 | stddev: 0.5773802112242618
mean absolute difference: 0.409396449704142 | stddev absolute difference: 0.22238413538978585

num_number[bot] mean: 0.4203165680473373 | stddev: 1.1000740914937575
num_number[human] mean: 0.21944970414201184 | stddev: 0.6272861882201396
mean absolute difference: 0.20086686390532546 | stddev absolute difference: 0.47278790327361786

In [13]:
sns.set(style="whitegrid")

def plot_violins_all(dataframe):
    for feature in all_new_features:
        ax = sns.violinplot(x="bot",
                            y=feature,
                            hue="bot",
                            data=dataframe)
        plt.show()

plot_violins_all(df)
In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
In [15]:
ct_full = ColumnTransformer([
    ("all_features", StandardScaler(), all_new_features),
])
df[all_new_features] = ct_full.fit_transform(df)
In [16]:
ct_words = ColumnTransformer([
    ("word_cnt", StandardScaler(), all_new_features),
])

df_train[all_new_features] = ct_words.fit_transform(df_train)
#print(f"train_transformed:\n{df_train}")

df_test[all_new_features] = ct_words.transform(df_test)
#print(f"\n\ntest_transformed:\n{df_test_transformed}")
#df_test
df_train
Out[16]:
author tweet bot clean_tweet word_count char_count word_density total_length capitals caps_vs_length num_exclamation_marks num_question_marks num_punctuation num_symbols num_unique_words words_vs_unique word_unique_percent num_retweet num_url num_number
0 47b0c91666f8e47948049eb1fc2202f9 Town &amp; Country Magazine Uninvited Monica L... bot town country magazine uninvited monica lewinsk... 2.314154 2.866896 -0.391454 2.820237 3.146854 0.614797 -0.29383 -0.248441 1.789318 1.670115 0.750228 -6.132534 -6.132534 -0.632829 1.857048 -0.354966
1 47b0c91666f8e47948049eb1fc2202f9 Lead Business Systems Analyst – Automated Test... bot lead business system analyst automated testing... 1.824393 2.850209 -0.760248 2.733655 2.742487 0.462912 -0.29383 -0.248441 3.270997 -0.113504 1.989912 -0.260526 -0.260526 0.920412 1.857048 1.859102
2 47b0c91666f8e47948049eb1fc2202f9 Senior Software Engineer, Business Process Eng... bot senior software engineer business process engi... 1.628488 2.766769 -0.864762 2.632643 1.529385 -0.078280 -0.29383 -0.248441 2.159738 -0.113504 1.877213 0.140647 0.140647 -0.632829 1.857048 -0.354966
3 47b0c91666f8e47948049eb1fc2202f9 Technical Support Engineer: Technical Support ... bot technical support engineer technical support e... 1.824393 2.866896 -0.770101 2.748086 2.540303 0.362542 -0.29383 -0.248441 2.159738 -0.113504 1.651816 -1.541691 -1.541691 2.473653 1.857048 0.752068
4 47b0c91666f8e47948049eb1fc2202f9 Mueller questioned Novartis payment to Trump l... bot mueller questioned novartis payment trump lawy... 2.020297 2.783457 -0.565775 2.704795 2.338120 0.283571 -0.29383 -0.248441 1.048479 -0.113504 0.975625 -4.658039 -4.658039 4.026895 1.857048 -0.354966
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
411995 70ddcdd222a02d2df685e67a38b903d7 A little more persistence, a little more effor... bot little persistence little effort seemed hopele... 0.355111 0.180144 0.331101 0.208352 -0.694636 -0.954081 -0.29383 -0.248441 -0.062781 -0.113504 0.299434 -0.901109 -0.901109 2.473653 -0.910344 -0.354966
411996 70ddcdd222a02d2df685e67a38b903d7 Perseverance is a great element of success. If... bot perseverance great element success knock long ... 0.942823 0.313647 1.123132 0.410376 -0.694636 -0.987688 -0.29383 -0.248441 -0.062781 -0.113504 0.975625 -0.556179 -0.556179 -0.632829 -0.910344 -0.354966
411997 70ddcdd222a02d2df685e67a38b903d7 Profit is a by-product of work; happiness is i... bot profit product work happiness chief product he... -0.330555 -0.570812 0.732653 -0.542024 -0.694636 -0.719412 -0.29383 -0.248441 -0.433201 -0.113504 -0.376757 -0.556179 -0.556179 -0.632829 -0.910344 -0.354966
411998 70ddcdd222a02d2df685e67a38b903d7 Quality questions create a quality life. Succe... bot quality question create quality life successfu... 0.453063 0.313647 0.249332 0.338225 -0.593544 -0.871124 -0.29383 -0.248441 0.307639 -0.113504 0.412133 -0.829933 -0.829933 -0.632829 -0.910344 -0.354966
411999 70ddcdd222a02d2df685e67a38b903d7 He who has a strong enough why can bear almost... bot ha strong enough bear almost friedrich nietzsche -0.232602 -0.570812 1.052136 -0.527594 -0.694636 -0.726954 -0.29383 -0.248441 -0.803620 -0.113504 -0.151360 0.593584 0.593584 -0.632829 -0.910344 -0.354966

412000 rows × 20 columns

In [ ]:
 
In [17]:
# Draw frames after scaling
# draw train data only
frame = df_train

fig = draw_word_features(frame)
fig.show()
fig = draw_url_rt_num(frame)
fig.show()
fig = draw_uniques(frame)
fig.show()
fig = draw_punctuation(frame)
fig.show()
In [18]:
# Draw frames after scaling
# draw test data only
frame = df_test

fig = draw_word_features(frame)
fig.show()
fig = draw_url_rt_num(frame)
fig.show()
fig = draw_uniques(frame)
fig.show()
fig = draw_punctuation(frame)
fig.show()
In [19]:
# Plot dataframe full
frame = df

fig = draw_word_features(frame)
fig.show()
fig = draw_url_rt_num(frame)
fig.show()
fig = draw_uniques(frame)
fig.show()
fig = draw_punctuation(frame)
fig.show()
In [20]:
plot_violins_all(df)
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [21]:
df_train.to_pickle("./pan19_df_clean_train_full_features.pkl")
df_test.to_pickle("./pan19_df_clean_test_full_features.pkl")